import os
import pandas as pd
import csv
import numpy as np
import math
from matplotlib.lines import Line2D
import matplotlib as mpl
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
import random
from scipy.spatial import distance
from scipy import stats
from sklearn import manifold
from sklearn.decomposition import PCA
import statsmodels.stats.multitest as sm

Summary

This file includes the results of a microbiome analysis performed on samples taken from four individuals that were originally used to determine the “Impact of DNA source on genetic variant detection from human whole-genome sequencing data”.

This included blood, saliva and buccal samples taken from four individuals (blood samples were taken at a different time than saliva and buccal samples). Additionally, a methylation-based enrichment for eukaryotic DNA was performed on the saliva and buccal samples.

Sections not completed:
- Absolute number of reads between different reference databases
- The intersection between different reference databases
- Functional profiling using HUMAnN2 (if I want to look more into the families that are differentially abundant)
- HUMAnN3 taxonomy
- Functional profiling using HUMAnN3
- Functional profiling using MEGAHIT and anvi’o
- Some kind of functional richness?

Basic processing steps

Obtaining reads

Fastq.gz files were downloaded from the ENA database, project accession number PRJNA523344

Kneaddata

Kneaddata was used for quality control and removal of human sequences. This included:
- Trimmomatic 0.39: “SLIDINGWINDOW:4:20 MINLEN:50”
- Bowtie2 with the GRCh38_PhiX database (to remove human and PhiX reads): “–fast –dovetail”

Reads kept after Kneaddata

#3 
#set up colors function (to get up to 120 colors, but with up to 40 unique colors)
def get_cols(num):
    colormap_20, colormap_40b, colormap_40c = mpl.cm.get_cmap('tab20', 256), mpl.cm.get_cmap('tab20b', 256), mpl.cm.get_cmap('tab20c', 256)
    norm, norm2 = mpl.colors.Normalize(vmin=0, vmax=19), mpl.colors.Normalize(vmin=20, vmax=39)
    m1, m2, m3 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap_20), mpl.cm.ScalarMappable(norm=norm, cmap=colormap_40b), mpl.cm.ScalarMappable(norm=norm2, cmap=colormap_40c)
    colors_20 = [m1.to_rgba(a) for a in range(20)]
    colors_40 = [m2.to_rgba(a) for a in range(20)]+[m3.to_rgba(a) for a in range(20,40)]
    if num < 21: return colors_20
    elif num < 41: return colors_40
    else: return colors_40+colors_40+colors_40
#and colors and shapes for different participants and body sites
colors_dict, shapes_dict = {'Blood':'#900C3F', 'Saliva':'#016F85', 'Buccal':'#ff8300', 'Saliva_euk':'#02aed1', 'Buccal_euk':'#FFC300'}, {'Huref':'o', 'PGPC-0002':'^', 'PGPC-0005':'*', 'PGPC-0006':'s', 'PGPC-0050':'p'}
#4
#get numbers of reads for different steps
reads = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/read_counts.txt', sep='\t', index_col=3, header=0)
participant_dict, site_dict, full_name_dict = {}, {}, {}
samples = list(reads.index.values)
for s in samples:
    participant_dict[s] = reads.loc[s, 'Participant']
    site_dict[s] = reads.loc[s, 'Body site']
    full_name_dict[s] = reads.loc[s, 'Participant']+' '+reads.loc[s, 'Body site']
total_reads = pd.DataFrame(reads.loc[:, 'cat_reads'])
sample_names = [participant_dict[name]+' '+site_dict[name] for name in samples]
colors = [colors_dict[s] for s in list(reads.loc[:, 'Body site'].values)]
shapes = [shapes_dict[s] for s in list(reads.loc[:, 'Participant'].values)]

Percentage reads remaining

#5
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Reads kept (%)')
plt.xlim([-0.5,20.5])
plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads kept (%)')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()

Number of reads remaining

#6
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.xlim([-0.5,20.5])
plt.ylabel('Reads remaining')

plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads remaining')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()

Table of reads remaining

#7
reads_remain = reads.loc[:, ['Percentage', 'cat_reads']].rename(index=full_name_dict, columns = {'cat_reads':'Number'})
#8
py$reads_remain %>%
  kable() %>%
  kable_styling()
Percentage Number
Huref Blood 0.2319067 1087311
PGPC-0002 Blood 0.5748318 3321664
PGPC-0005 Blood 0.5149286 1903482
PGPC-0006 Blood 0.7146067 3593441
PGPC-0050 Blood 0.6870164 2940390
PGPC-0002 Saliva 3.9289510 16561343
PGPC-0005 Saliva 55.6458826 244058333
PGPC-0006 Saliva 13.1608121 60497393
PGPC-0050 Saliva 56.3700720 257049004
PGPC-0002 Buccal 2.8112226 11462781
PGPC-0005 Buccal 2.8202504 12751614
PGPC-0006 Buccal 5.1521560 22434503
PGPC-0050 Buccal 2.4297107 10667011
PGPC-0002 Saliva_euk 1.2333408 5345888
PGPC-0005 Saliva_euk 8.3104464 38138382
PGPC-0006 Saliva_euk 1.9067564 8786635
PGPC-0050 Saliva_euk 5.6187272 25037552
PGPC-0002 Buccal_euk 0.9385856 4380036
PGPC-0005 Buccal_euk 1.2119820 5584415
PGPC-0006 Buccal_euk 1.5567291 7232280
PGPC-0050 Buccal_euk 1.3836141 6382288

Taxonomic profiling

The taxonomy has been profiled using:
1. HUMAnN
- HUMAnN2 and MetaPhlAn2
- HUMAnN3 and MetaPhlAn3
2. Kraken2 with Bracken
- GTDB (no confidence parameter set) -
using the database constructed using Struo, release 89 - GTDB (confidence = 0.1)
- Minikraken v1 (no human genome, no confidence parameter set)
- Minikraken v1 (no human genome, confidence = 0.1)
- Minikraken v2 (with human genome, no confidence parameter set)
- Minikraken v2 (with human genome, confidence = 0.1)
- RefSeq Complete v93 (no confidence parameter set)
- RefSeq Complete v93 (confidence = 0.1)

MetaPhlAn2 taxonomy nMDS plots

#9
#get the taxonomy file and sort it to strain and genus level
taxa = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/metaphlan_merged.tsv', sep='\t', header=0, index_col=0)
tax_names = list(taxa.index.values)
keeping = []
for a in range(len(tax_names)):
    if 't__' in tax_names[a]:
        keeping.append(True)
    elif 'unclassified' in tax_names[a]:
        keeping.append(True)
    else:
        keeping.append(False)
strain = taxa.loc[keeping, :]
strain_names = list(strain.index.values)
strain_dict = {}
for i in range(len(strain_names)):
    strain_dict[strain_names[i]] = strain_names[i].split('|s__')[0].split('|g__')[1]
genus = strain.rename(index=strain_dict)
genus = genus.groupby(by=genus.index, axis=0).sum()
#10
#define the function that calculates the nmds plots

def transform_for_NMDS(df, dist_met='braycurtis'):
    X = df.iloc[0:].values
    y = df.iloc[:,0].values
    seed = np.random.RandomState(seed=3)
    X_true = X
    similarities = distance.cdist(X_true, X_true, dist_met)
    mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
                   dissimilarity="precomputed", n_jobs=1)
    #print(similarities)
    pos = mds.fit(similarities).embedding_
    nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                        dissimilarity="precomputed", random_state=seed, n_jobs=1,
                        n_init=1)
    npos = nmds.fit_transform(similarities, init=pos)
    # Rescale the data
    pos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((pos ** 2).sum())
    npos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((npos ** 2).sum())
    # Rotate the data
    clf = PCA()
    X_true = clf.fit_transform(X_true)
    pos = clf.fit_transform(pos)
    npos = clf.fit_transform(npos)
    return pos, npos, nmds.stress_
strain_t = strain.transpose()
genus_t = genus.transpose()

handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]

Bray-Curtis distance

#11
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'braycurtis')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'braycurtis')

plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
    ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
    ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a],  color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')

ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()

Euclidean distance

#12
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'euclidean')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'euclidean')

plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
    ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
    ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a],  color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')

ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()

Jaccard distance

#13
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'jaccard')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'jaccard')

plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
    ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
    ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a],  color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')

ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()

MetaPhlAn2 taxonomy relative abundance

Kingdom

Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Kingdom level for each sample.

#14
plt.figure(figsize=(7,5))
ax1 = plt.subplot(111)
plt.bar(list(taxa.columns.values), taxa.loc['k__Viruses', :].values, color='#C70039', edgecolor='k')
plt.bar(list(taxa.columns.values), taxa.loc['k__Bacteria', :].values, bottom=taxa.loc['k__Viruses', :].values, color='#026B81', edgecolor='k')
#plt.xticks(list(taxa.columns.values), sample_names, rotation=90)
empty = []
for x in range(0,21):
    empty.append('')
    ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
handles = [Patch(facecolor='#C70039', edgecolor='k', label='Viruses'), Patch(facecolor='#026B81', edgecolor='k', label='Bacteria')]
plt.legend(handles=handles, bbox_to_anchor=(1,1.05))
plt.tight_layout()
plt.show()

Genus

Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Genus level for each sample. Genera with below 1% maximum relative abundance have been removed.

#15
genus = genus[genus.max(axis=1) > 1]
genera = list(genus.index.values)
plt.figure(figsize=(10,5))
ax1 = plt.subplot(111)
gen_colors = get_cols(len(genus.index.values))
handles = []
for g in range(len(genera)):
    this_gen = genus.loc[genera[g], :].values
    if g == 0:
        ax1.bar(list(genus.columns.values), this_gen, color=gen_colors[g], edgecolor='k')
        total = this_gen
    else:
        ax1.bar(list(genus.columns.values), this_gen, bottom=total, color=gen_colors[g], edgecolor='k')
        total = this_gen+total
    handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=genera[g]))
empty = []
for x in range(0,21):
    empty.append('')
    ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
plt.legend(handles=handles, bbox_to_anchor=(1,1.05), ncol=2)
plt.tight_layout()
plt.show()

MetaPhlAn3 taxonomy nMDS plots

Bray-Curtis distance

Euclidean distance

Jaccard distance

MetaPhlAn3 taxonomy relative abundance

Kingdom

Genus

Kraken2 reads classified

#16
#get all samples into dataframes based on the database that they use
folders = sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2'))
del folders[0]
kraken_columns = {0:'Percent fragments clade', 1:'Number fragments clade', 2:'Number fragments taxon', 3:'Level', 4:'NCBI ID', 5:'Taxon name'}
kraken_all_db, bracken_all_db, all_domains = [], [], {}
for fol in folders:
    if fol == 'db_genera' or fol == 'db_genera_in_common':
      continue
    if not os.path.isdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol):
        continue
    bracken, kraken, bracken_kreport = [], [], []
    bracken_pd, kraken_pd = [], []
    for fi in sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol)):
        if fi[-7:] == 'bracken':
            bracken.append(fi)
        elif fi[-7:] == 'kreport' and 'bracken' not in fi:
            kraken.append(fi)
        elif fi[-7:] == 'kreport':
            bracken_kreport.append(fi)
    for bk in bracken_kreport:
        with open('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+bk, 'rU') as f:
            bk = []
            domains = {}
            this_domain, domain_name = [], ''
            for row in csv.reader(f, delimiter='\t'):
                bk.append(row)
                row[5] = row[5].lstrip()
                if row[3] == 'D':
                    if domain_name != '':
                        domains[domain_name] = this_domain
                    this_domain, domain_name = [], row[5]
                else:
                    if row[3] != 'R' and row[3] != 'U' and 'D' not in row[3]:
                        this_domain.append(row[5])
            domains[domain_name] = this_domain
            for domain in domains:
                if domain in all_domains:
                    all_domains[domain] = list(set(all_domains[domain]+domains[domain]))
                else:
                    all_domains[domain] = list(set(domains[domain]))
    for b in bracken:
        if len(b) > 22:
            continue
        sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+b, sep='\t', header=0, index_col=0)
        b = b.replace('_150', '')
        sample.drop(['taxonomy_id', 'taxonomy_lvl', 'kraken_assigned_reads', 'added_reads', 'fraction_total_reads'], axis=1, inplace=True)
        sample.rename(columns={'new_est_reads':b[:-8]}, inplace=True)
        bracken_pd.append(sample)
    for k in kraken:
        sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+k, sep='\t', header=None, index_col=3)
        sample = sample.loc[['U', 'D'], :]
        sample = sample.rename(columns=kraken_columns).drop(['Number fragments taxon', 'NCBI ID'], axis=1).rename(columns={'Percent fragments clade':k[:-8]+'_percent', 'Number fragments clade':k[:-8]+'_reads'}).set_index('Taxon name')
        taxa = list(sample.index.values)
        taxa_dict = {}
        for t in taxa:
            taxa_dict[t] = t.replace(' ', '')
        sample = sample.rename(index=taxa_dict)
        kraken_pd.append(sample)
    bracken = pd.concat(bracken_pd, join='outer')
    kraken = pd.concat(kraken_pd, join='outer')
    kraken = kraken.rename(index={'d__Bacteria':'Bacteria', 'd__Archaea':'Archaea'})
    kraken = kraken.groupby(by=kraken.index, axis=0).sum()
    bracken = bracken.groupby(by=bracken.index, axis=0).sum().fillna(value=0)
    kraken_all_db.append(kraken), bracken_all_db.append(bracken)
#17
x1 = [x for x in range(21)]
x2 = [x+0.3 for x in range(21)]
tax_plotting = ['Archaea', 'Bacteria', 'Eukaryota', 'Viruses', 'unclassified']
color_plotting = ['#EDBB99', '#5499C7', '#7DCEA0', '#F7DC6F', '#CCD1D1']
tax_paper = ['Bacteria', 'Eukaryota', 'Other', 'Unclassified']
color_paper = ['#5499C7', '#7DCEA0', '#CD6155', '#CCD1D1']
from_paper = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/from_paper.csv', header=0, index_col=0)

def get_summary_reads(kraken_db):
    fig = plt.figure(figsize=(15,15))
    ax1, ax2, ax3, ax4, ax5 = plt.subplot(321), plt.subplot(322), plt.subplot(323), plt.subplot(324), plt.subplot(325)
    ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
    ax5.set_title('From paper')
    ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
    x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
    axs = [ax1, ax2, ax3, ax4, ax5]
    for s in range(len(samples)):
        if s == 0:
            continue
        bottom = 0
        for t in range(len(tax_paper)):
            ax5.bar(x1[s], from_paper.loc[tax_paper[t], samples[s]], bottom=bottom, color=color_paper[t], edgecolor='k', width=0.6)
            bottom += from_paper.loc[tax_paper[t], samples[s]]
    for db in range(len(kraken_db)):
        ax_using = ax_plot[db]
        x = x_plot[db]
        db = kraken_db[db]
        handles = []
        for tax in range(len(tax_plotting)):
            handles.append(Patch(facecolor=color_plotting[tax], edgecolor='k', label=tax_plotting[tax]))
            tax = tax_plotting[tax]
            if tax not in list(db.index.values):
                db.loc[tax] = [0 for i in range(db.shape[1])]
        handles.append(Patch(facecolor=color_paper[2], edgecolor='k', label='Other'))
        db = db.fillna(value=0)
        for s in range(len(samples)):
            bottom = 0
            for t in range(len(tax_plotting)):
                prop = db.loc[tax_plotting[t], samples[s]+'_reads']
                cat = total_reads.loc[samples[s], 'cat_reads']
                prop = (prop/cat)*100
                ax_using.bar(x[s], prop, bottom=bottom, color=color_plotting[t], edgecolor='k', width=0.3)
                bottom += prop
    ax2.legend(handles=handles, bbox_to_anchor=(1,1.05))
    for ax in axs:
        plt.sca(ax)
        plt.xticks(x1, ['' for x in x1])
        plt.ylim([0, 100])
        plt.xlim([-0.5, 20.5])
    for x in x1:
        ax5.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
        ax4.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
    ax1.set_ylabel('Classified (%)'), ax3.set_ylabel('Classified (%)'), ax5.set_ylabel('Classified(%)')
    #plt.tight_layout()
    return

def get_summary_bacteria(kraken_db):
    tax_plotting = ['Bacteria']
    alpha = ['#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F']
    
    fig = plt.figure(figsize=(10,8))
    ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
    ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
    ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
    x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
    axs = [ax1, ax2, ax3, ax4]
    
    for db in range(len(kraken_db)):
        ax_using = ax_plot[db]
        x = x_plot[db]
        alp = alpha[db]
        db = kraken_db[db]
        for tax in range(len(tax_plotting)):
            tax = tax_plotting[tax]
            if tax not in list(db.index.values):
                db.loc[tax] = [0 for i in range(db.shape[1])]
        db = db.fillna(value=0)
        for s in range(len(samples)):
            bottom = 0
            for t in range(len(tax_plotting)):
                prop = db.loc[tax_plotting[t], samples[s]+'_reads']
                cat = total_reads.loc[samples[s], 'cat_reads']
                ax_using.bar(x[s], prop, bottom=bottom, color=alp, edgecolor='k', width=0.3)
                bottom += prop
    handles = []
    handles.append(Patch(facecolor=alpha[0], edgecolor='k', label='No confidence value'))
    handles.append(Patch(facecolor=alpha[1], edgecolor='k', label='Confidence=0.1'))
    ax2.legend(handles=handles, bbox_to_anchor=(1.6,1.03))
    for ax in axs:
        plt.sca(ax)
        plt.xticks(x1, ['' for x in x1])
        plt.semilogy()
        plt.xlim([-0.5, 20.5])
        #plt.ylim([0, 100])
        plt.xlim([-0.5, 20.5])
    for x in x1:
        pl = ((1/21)*(x+1))-0.02
        ax3.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax3.transAxes)
        ax4.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax4.transAxes)
    ax1.set_ylabel('Number of reads'), ax3.set_ylabel('Number of reads')
    #plt.tight_layout()
    return

Percent classified

A summary of the percentage of reads classified as different domains with different databases. Note that the ‘From paper’ plot uses the classifications given in the original paper, where 10,000 unmapped reads were classified using BLAST searches of the NCBI database.

#18
get_summary_reads(kraken_all_db)
plt.show()

Summary of number of bacteria

Summary of the number of reads that are classified as bacteria by each database.

#19
get_summary_bacteria(kraken_all_db)
plt.tight_layout()
plt.show()

Kraken2 taxonomy nMDS plots

These first plots are all separately with the confidence parameter set. See the last tab for those without the confidence parameter set.

#20
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []

for db in range(len(bracken_all_db)):
    d = int(db)
    db = bracken_all_db[db]
    species = list(db.index.values)
    keeping = []
    species_dict = {}
    for sp in species:
        if sp in bacteria:
            keeping.append(True)
            new_sp = sp.split('__')
            if len(new_sp) > 1:
                new_sp = new_sp[1]
            else:
                new_sp = new_sp[0]
            species_dict[sp] = new_sp.split(' ')[0].replace("'", '')
        else:
            keeping.append(False)
    in_len = db.shape[0]
    db = db.loc[keeping, :]
    strain.append(db)
    db = db.rename(index=species_dict)
    db = db.groupby(by=db.index, axis=0).sum()
    sums = db.sum(axis=0)
    gen_sums.append(sums)
    db = db.divide(sums, axis=1).multiply(100)
    genera.append(db)
    db.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/'+db_names[d]+'.csv')
    gen_names = gen_names+list(db.index.values)
    db = db[db.max(axis=1) > 1]
    genera_1.append(db)
    gen_names_1 = gen_names_1+list(db.index.values)
gen_names = list(set(gen_names))
gen_names_1 = list(set(gen_names_1))
#21
def plot_four_nmds(dbs, metric, name):
    fig = plt.figure(figsize=(15,10))
    #fig.suptitle(name+metric+'\n\n\n')
    ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
    axs = [ax3, ax1, ax2, ax4]
    ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
    
    for db in range(len(dbs)):
        n = db
        db = dbs[db].transpose()
        pos, npos, stress = transform_for_NMDS(db, metric)
        for a in range(len(npos)):
            axs[n].scatter(npos[a,0], npos[a,1], marker=shapes[a], color=colors[a], s=100, edgecolor='k')
        axs[n].set_xlabel('nMDS1')
        axs[n].set_ylabel('nMDS2')
    handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
    handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]

    ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
    plt.tight_layout()
    return

Bray-Curtis distance strain level

#22
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'braycurtis', 'NMDS confidence=0.1 strain ')
plt.show()

Bray-Curtis distance genus level

#23
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'braycurtis', 'NMDS confidence=0.1 genera ')
plt.show()

Euclidean distance strain level

#24
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'euclidean', 'NMDS confidence=0.1 strain ')
plt.show()

Euclidean distance genus level

#25
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'euclidean', 'NMDS confidence=0.1 genera ')
plt.show()

Jaccard distance strain level

#26
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'jaccard', 'NMDS confidence=0.1 strain ')
plt.show()

Jaccard distance genus level

#27
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'jaccard', 'NMDS confidence=0.1 genera ')
plt.show()

All plots with no confidence parameter set

Bray-curtis distance at strain level

#28
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'braycurtis', 'NMDS no confidence strain ')
plt.show()

Bray-curtis distance at genus level

#29
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'braycurtis', 'NMDS no confidence genera ')
plt.show()

Euclidean distance at strain level

#30
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'euclidean', 'NMDS no confidence strain ')
plt.show()

Euclidean distance at genus level

#31
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'euclidean', 'NMDS no confidence genera ')
plt.show()

Jaccard distance at strain level

#32
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'jaccard', 'NMDS no confidence strain ')
plt.show()

Jaccard distance at genus level

#33
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'jaccard', 'NMDS no confidence genera ')
plt.show()

Kraken2 taxonomy relative abundance

These plots are now only calculated for the classifications that used confidence = 0.1. Genera with below 1% maximum relative abundance are removed and the numbers in brackets are the number of reads that were classified as bacteria.

#34
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
#bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
#genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []
gen_names_1 = sorted(gen_names_1)
def plot_genera(db, sums, tax_cols, gen_names_1, dname):
    plt.figure(figsize=(10,5))
    ax1 = plt.subplot(111)
    bottom = [0 for x in range(len(db.columns))]
    handles = []
    for g in range(len(gen_names_1)):
        if gen_names_1[g] in db.index.values:
            this_row = db.loc[gen_names_1[g], :].values
            ax1.bar(x1, this_row, bottom=bottom, color=tax_cols[g], edgecolor='k')
            handles.append(Patch(facecolor=tax_cols[g], edgecolor='k', label=gen_names_1[g]))
            for b in range(len(bottom)):
                bottom[b] += this_row[b]
    ax1.legend(handles=handles, bbox_to_anchor=(1, 1.03), ncol=3)
    plt.xticks(x1, ['' for x in x1])
    plt.ylabel('Relative abundance(%)')
    plt.xlim([-0.5, 20.5])
    plt.ylim([0, 100])
    for x in x1:
        n = str(int(sums[samples[x]]))
        ax1.text(x, -2, sample_names[x]+' ('+n+')', color=colors[x], rotation=90, va='top', ha='center')
    plt.tight_layout()
    return

gen_plot = [genera_1[1], genera_1[3], genera_1[5], genera_1[7]]
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
all_sums = [gen_sums[1], gen_sums[3], gen_sums[5], gen_sums[7]]
tax_cols = get_cols(len(gen_names_1))

Minikraken v1

#35
plot_genera(gen_plot[1], all_sums[1], tax_cols, gen_names_1, db_name[1])
plt.show()

Minikraken v2

#36
plot_genera(gen_plot[2], all_sums[2], tax_cols, gen_names_1, db_name[2])
plt.show()

GTDB

#37
plot_genera(gen_plot[0], all_sums[0], tax_cols, gen_names_1, db_name[0])
plt.show()

RefSeq Complete v93

#38
plot_genera(gen_plot[3], all_sums[3], tax_cols, gen_names_1, db_name[3])
plt.show()

Kraken2 similarities and differences between body sites

Here I have carried out ANCOM2 tests for differential abundance of genera between body sites. All genera are then plotted on a heatmap with mean values for each body site and stars to denote significant differences as determined by ANCOM.

While many of these results vary between databases, it is worth noting that some differences are consistent, e.g.:
1. Prevotella are always more abundant in saliva samples
2. Streptococcus are always more abundant in buccal samples
3. Klebsiella (where present) are always more abundant in blood samples

#39
def get_differences(new_genus, db_name):
    new_genus = new_genus.drop(['SRR8595488'], axis=1)
    new_genus = new_genus[new_genus.max(axis=1) > 0.1]
    samples = list(new_genus.columns)
    
    list_comparisons = [['Blood', 'Saliva'], ['Blood', 'Buccal'], ['Saliva', 'Buccal'], ['Saliva', 'Saliva_euk'], ['Buccal', 'Buccal_euk'], ['Blood', 'Saliva_euk'], ['Blood', 'Buccal_euk'], ['Saliva_euk', 'Buccal_euk']]
    comparisons, metadata, comp_len = [], [], []
    for a in range(len(list_comparisons)):
        keeping = []
        this_md = []
        for b in range(len(samples)):
            if site_dict[samples[b]] in list_comparisons[a]:
                keeping.append(True)
                this_md.append([samples[b], site_dict[samples[b]]])
            else:
                keeping.append(False)
        this_comp = new_genus.loc[:, keeping]
        this_comp = this_comp[this_comp.max(axis=1) > 0.1]
        comparisons.append(this_comp)
        this_md = pd.DataFrame(this_md, columns=['Samples', 'Groups'])
        #this_md.set_index('Samples', inplace=True)
        metadata.append(this_md)
        comp_len.append(this_comp.shape[0])
    new_genus = new_genus.rename(columns=site_dict, inplace=False)
    return comparisons, metadata, new_genus, comp_len
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
comparison_names = [r'Blood vs saliva', r'Blood vs buccal', r'Saliva vs buccal', r'Saliva vs saliva_euk', r'Buccal vs buccal_euk', r'Blood vs saliva_euk', r'Blood vs buccal_euk', r'Saliva_euk vs buccal_euk']
source("/Users/robynwright/Documents/OneDrive/Github/R-notebooks/ancom_v2.1.R")

get_ancom <- function(ft, md) {
all_ancom = list()
for (a in 1:8){
  feature_table = ft[a]
  meta_data = md[a]
  process = feature_table_pre_process(feature_table, meta_data, 'Samples', 'Groups', lib_cut=10, neg_lb=TRUE)
  ancom_out = ANCOM(process$feature_table, process$meta_data, process$struc_zero, main_var='Groups')
  all_ancom[[a]] <- ancom_out$out
}
return(all_ancom)
}
def sort_ancom_results(r_ancom):
  ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06 = [], [], [], []
  for a in range(len(r_ancom)):
    this_sig_09, this_sig_08, this_sig_07, this_sig_06 = [], [], [], []
    r_ancom[a].set_index('taxa_id', inplace=True)
    all_sp = list(r_ancom[a].index.values)
    for b in range(len(all_sp)):
      if r_ancom[a].loc[all_sp[b], 'detected_0.9'] == True:
        this_sig_09.append(all_sp[b])
      if r_ancom[a].loc[all_sp[b], 'detected_0.8'] == True:
        this_sig_08.append(all_sp[b])
      if r_ancom[a].loc[all_sp[b], 'detected_0.7'] == True:
        this_sig_07.append(all_sp[b])
      if r_ancom[a].loc[all_sp[b], 'detected_0.6'] == True:
        this_sig_06.append(all_sp[b])
    ancom_lists_09.append(this_sig_09), ancom_lists_08.append(this_sig_08), ancom_lists_07.append(this_sig_07), ancom_lists_06.append(this_sig_06)
  return [ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06]
def plot_heatmap(new_genus, ANCOM):
    print(ANCOM)
    names = ['Blood', 'Saliva', 'Buccal', 'Saliva_euk', 'Buccal_euk']
    other_names = ['Abundance', r'Blood $vs$ saliva', r'Blood $vs$ buccal', r'Saliva $vs$ buccal', r'Saliva $vs$ saliva_euk', r'Buccal $vs$ buccal_euk', r'Blood $vs$ saliva_euk', r'Blood $vs$ buccal_euk', r'Saliva_euk $vs$ buccal_euk']
    colormap, norm = mpl.cm.get_cmap('plasma', 256), mpl.colors.Normalize(vmin=0, vmax=1)
    m = mpl.cm.ScalarMappable(norm=norm, cmap=colormap)
    if list(new_genus.index.values)[0][0] == 'A':
        new_genus = new_genus.iloc[::-1]
    figure = plt.figure(figsize=(5,new_genus.shape[0]*0.2))
    ax1 = plt.subplot(111)
    genus_names = list(new_genus.index.values)
    y = []
    for g in range(len(genus_names)):
        this_row = new_genus.loc[genus_names[g], :]
        values = [(np.mean(this_row[name].values)) for name in names]
        bottom, top, x = [g for a in range(5)], [1 for a in range(5)], [a for a in range(5)]
        y.append(g+0.5)
        ma = max(values)
        values = [v/ma for v in values]
        values = [m.to_rgba(v) for v in values]
        ax1.bar(x, top, bottom=bottom, color=values, edgecolor='k', width=1)
        x.append(5)
        ax1.scatter(x[-1], bottom[-1]+0.5, color='k', s=ma*5)
        sig, x_plt = [], x[-1]
        for a in range(len(ANCOM)):
            x_plt += 0.75
            if genus_names[g] in ANCOM[a]: ax1.scatter(x_plt, bottom[-1]+0.5, marker='*', color='k')
            x.append(x_plt)
    plt.ylim([0, len(genus_names)]), plt.xlim([-0.5, x[-1]+0.5])
    plt.yticks(y, genus_names)
    plt.xticks(x, names+other_names, rotation=90)
    ax1.xaxis.set_ticks_position('top')
    plt.tight_layout()
    plt.show()
    return

Minikraken v1 ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 41 41 37 26 8
Blood vs buccal 37 27 21 15 3
Saliva vs buccal 45 3 3 2 1
Saliva vs saliva_euk 41 0 0 0 0
Buccal vs buccal_euk 39 2 0 0 0
Blood vs saliva_euk 38 32 24 15 3
Blood vs buccal_euk 34 11 5 2 2
Saliva_euk vs buccal_euk 43 5 3 2 1

Minikraken v1 heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])
## [['Burkholderia', 'Enterobacter', 'Klebsiella', 'Paracoccus', 'Pasteurella', 'Prevotella', 'Rothia', 'Staphylococcus'], ['Burkholderia', 'Rothia', 'Streptococcus'], ['Prevotella'], [], [], ['Burkholderia', 'Prevotella', 'Rothia'], ['Rothia', 'Streptococcus'], ['Prevotella']]
## 
## <string>:9: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).

Minikraken v2 ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_human_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 49 48 47 38 9
Blood vs buccal 48 35 16 14 4
Saliva vs buccal 39 5 4 1 0
Saliva vs saliva_euk 30 0 0 0 0
Buccal vs buccal_euk 37 0 0 0 0
Blood vs saliva_euk 49 47 33 22 5
Blood vs buccal_euk 50 19 14 12 4
Saliva_euk vs buccal_euk 42 4 3 3 2

Minikraken v2 heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])
## [['Capnocytophaga', 'Clostridium', 'Klebsiella', 'Mogibacterium', 'Prevotella', 'Pseudomonas', 'Rothia', 'Streptomyces', 'Veillonella'], ['Klebsiella', 'Rothia', 'Streptococcus', 'Streptomyces'], [], [], [], ['Campylobacter', 'Klebsiella', 'Prevotella', 'Pseudomonas', 'Streptomyces'], ['Klebsiella', 'Rothia', 'Streptococcus', 'Streptomyces'], ['Prevotella', 'Streptococcus']]

GTDB ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/gtdb_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 98 88 82 32 6
Blood vs buccal 91 21 14 8 5
Saliva vs buccal 96 18 10 2 1
Saliva vs saliva_euk 93 3 2 0 0
Buccal vs buccal_euk 95 8 5 1 0
Blood vs saliva_euk 92 27 17 11 4
Blood vs buccal_euk 75 7 7 3 2
Saliva_euk vs buccal_euk 97 19 13 7 1

GTDB heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])
## [['Mycobacterium', 'Pauljensenia', 'Prevotella', 'Psychrobacter', 'Streptococcus', 'Veillonella'], ['Actinomyces', 'Prevotella', 'Rothia', 'Streptococcus', 'Veillonella'], ['Prevotella'], [], [], ['Acinetobacter', 'Prevotella', 'Streptococcus', 'Veillonella'], ['Acinetobacter', 'Poseidonibacter'], ['Prevotella']]

RefSeq Complete v93 ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/refseq_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 85 81 77 35 11
Blood vs buccal 82 65 36 21 7
Saliva vs buccal 69 9 6 3 0
Saliva vs saliva_euk 64 0 0 0 0
Buccal vs buccal_euk 76 4 4 1 0
Blood vs saliva_euk 86 72 37 20 4
Blood vs buccal_euk 79 18 15 8 2
Saliva_euk vs buccal_euk 84 12 10 5 1

RefSeq Complete v93 heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])
## [['Bordetella', 'Enterococcus', 'Escherichia', 'Klebsiella', 'Mesorhizobium', 'Mycobacterium', 'Prevotella', 'Pseudomonas', 'Salmonella', 'Staphylococcus', 'Veillonella'], ['Actinomyces', 'Mycobacterium', 'Prevotella', 'Pseudomonas', 'Rothia', 'Streptococcus', 'Veillonella'], [], [], [], ['Mycobacterium', 'Prevotella', 'Pseudomonas', 'Veillonella'], ['Actinomyces', 'Veillonella'], ['Prevotella']]

Kraken2 intersection between databases

Now we are looking at the taxa (grouped to genus level, to hopefully allow for differences between databases) that are in common between the different databases that have been used with Kraken2. We are also using only the versions that have confidence=0.1.

Genera common to all databases (reads before and after)

The plots below here show the number of reads in each sample before and after removal of the genera that aren’t present in all databases.

#print(bracken_all_db)
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3

conf_bracken_genus = []
genus_in_each = []
for a in range(len(bracken_all_db)):
  if a not in [0, 2, 4, 6]:
    continue
  this_db = pd.DataFrame(bracken_all_db[a])
  taxa = list(this_db.index.values)
  tax_dict = {}
  for b in range(len(taxa)):
    orig_tax = taxa[b]
    if 's__' in taxa[b]:
      taxa[b] = taxa[b].replace('s__', '')
    taxa[b] = taxa[b].split(' ')[0]
    taxa[b] = taxa[b].replace("'", "")
    tax_dict[orig_tax] = taxa[b]
  this_db.rename(index=tax_dict, inplace=True)
  genus = this_db.groupby(by=this_db.index, axis=0).sum()
  conf_bracken_genus.append(genus)
  genus_in_each.append(list(genus.index.values))

[print(len(gen)) for gen in genus_in_each]
## 7452
## 1717
## 1304
## 4736
## [None, None, None, None]
overall_genus = intersection(genus_in_each[0], genus_in_each[1])
overall_genus = intersection(overall_genus, genus_in_each[2])
overall_genus = intersection(overall_genus, genus_in_each[3])
print(len(overall_genus))
## 1054
fig = plt.figure(figsize=(10,6))
ax = [plt.subplot(223), plt.subplot(221), plt.subplot(222), plt.subplot(224)]
x1 = [x for x in range(21)]
x2 = [x+0.4 for x in range(21)]
x3 = [x+0.2 for x in range(21)]

conf_bracken_overall = []
names = ['gtdb', 'minikrakenv1', 'minikrakenv2', 'refseq']
labels = ['GTDB', 'Minikraken v1 (without human)', 'Minikraken v2 (with human)', 'RefSeq Complete v93']
sum_reduced = []
for db in range(len(conf_bracken_genus)):
  new_db = pd.DataFrame(conf_bracken_genus[db].loc[overall_genus, :])
  conf_bracken_overall.append(new_db)
  new_db.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera_in_common/'+names[db]+'_common.csv')
  conf_bracken_genus[db].to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera_in_common/'+names[db]+'.csv')
  sums = new_db.sum(axis=0)
  sum_reduced.append(sums)
  ax[db].bar(x2, sums, color=colors, edgecolor='k', width=0.4, alpha=0.5)
  sums = conf_bracken_genus[db].sum(axis=0)
  ax[db].bar(x1, sums, color=colors, edgecolor='k', width=0.4)
  ax[db].semilogy()
  ax[db].set_title(labels[db])
  plt.sca(ax[db])
  if db == 1 or db == 2:
    plt.xticks([])
  else:
    plt.xticks(x3, sample_names, rotation=90)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## []
## Text(0.5, 1.0, 'GTDB')
## ([<matplotlib.axis.XTick object at 0x1262b1100>, <matplotlib.axis.XTick object at 0x1262b10d0>, <matplotlib.axis.XTick object at 0x1262906d0>, <matplotlib.axis.XTick object at 0x125df3760>, <matplotlib.axis.XTick object at 0x125df3b50>, <matplotlib.axis.XTick object at 0x125dc4130>, <matplotlib.axis.XTick object at 0x125dc46d0>, <matplotlib.axis.XTick object at 0x125dc4c70>, <matplotlib.axis.XTick object at 0x125dc3250>, <matplotlib.axis.XTick object at 0x125dc37f0>, <matplotlib.axis.XTick object at 0x125dc3d90>, <matplotlib.axis.XTick object at 0x125dcf370>, <matplotlib.axis.XTick object at 0x125dcf910>, <matplotlib.axis.XTick object at 0x125dc32e0>, <matplotlib.axis.XTick object at 0x125dc4190>, <matplotlib.axis.XTick object at 0x125dcf460>, <matplotlib.axis.XTick object at 0x125dd1130>, <matplotlib.axis.XTick object at 0x125dd16d0>, <matplotlib.axis.XTick object at 0x125dd1c70>, <matplotlib.axis.XTick object at 0x125dc0250>, <matplotlib.axis.XTick object at 0x125dc07f0>], <a list of 21 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## []
## Text(0.5, 1.0, 'Minikraken v1 (without human)')
## ([], <a list of 0 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## []
## Text(0.5, 1.0, 'Minikraken v2 (with human)')
## ([], <a list of 0 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## []
## Text(0.5, 1.0, 'RefSeq Complete v93')
## ([<matplotlib.axis.XTick object at 0x125d29a00>, <matplotlib.axis.XTick object at 0x125d299d0>, <matplotlib.axis.XTick object at 0x125d48fd0>, <matplotlib.axis.XTick object at 0x12a8fea60>, <matplotlib.axis.XTick object at 0x12a8f9040>, <matplotlib.axis.XTick object at 0x12a8f95e0>, <matplotlib.axis.XTick object at 0x12a8f9b80>, <matplotlib.axis.XTick object at 0x12a8e5160>, <matplotlib.axis.XTick object at 0x12a8e5700>, <matplotlib.axis.XTick object at 0x12a8f97c0>, <matplotlib.axis.XTick object at 0x12a8e5ac0>, <matplotlib.axis.XTick object at 0x12a8e5df0>, <matplotlib.axis.XTick object at 0x12a9073d0>, <matplotlib.axis.XTick object at 0x12a907970>, <matplotlib.axis.XTick object at 0x12a907f10>, <matplotlib.axis.XTick object at 0x12a8e44f0>, <matplotlib.axis.XTick object at 0x12a8e4a90>, <matplotlib.axis.XTick object at 0x12a8f5070>, <matplotlib.axis.XTick object at 0x12a8f5610>, <matplotlib.axis.XTick object at 0x12a907fa0>, <matplotlib.axis.XTick object at 0x12a8e5340>], <a list of 21 Text xticklabel objects>)
ax[0].set_ylabel('Number of reads'), ax[1].set_ylabel('Number of reads')
## (Text(0, 0.5, 'Number of reads'), Text(0, 0.5, 'Number of reads'))
handles = [Patch(facecolor='k', edgecolor='k', label='All genera'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Genera present in all')]
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()

Genera common to all databases (reads remaining)

fig = plt.figure(figsize=(8,4))
ax = plt.subplot(111)
x1 = [x*5 for x in range(21)]
x2 = [x+1 for x in x1]
x3 = [x+1 for x in x2]
x4 = [x+1 for x in x3]
xplt = [x+0.5 for x in x2]

x = [x3, x1, x2, x4]
al = [1, 0.8, 0.6, 0.4]
for db in range(len(sum_reduced)):
  ax.bar(x[db], sum_reduced[db], color=colors, width=1, edgecolor='k', alpha=al[db])
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
handles = [Patch(facecolor='k', edgecolor='k', label='Minikraken v1'), Patch(facecolor='k', edgecolor='k', alpha=0.8, label='Minikraken v2'), Patch(facecolor='k', edgecolor='k', alpha=0.6, label='GTDB'), Patch(facecolor='k', edgecolor='k', alpha=0.4, label='RefSeq Complete v93')]
ax.legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1))
plt.xticks(xplt, sample_names, rotation=90)
## ([<matplotlib.axis.XTick object at 0x1287cea00>, <matplotlib.axis.XTick object at 0x1287cebb0>, <matplotlib.axis.XTick object at 0x1287d4640>, <matplotlib.axis.XTick object at 0x12a8e4670>, <matplotlib.axis.XTick object at 0x12a907f10>, <matplotlib.axis.XTick object at 0x12a8f98e0>, <matplotlib.axis.XTick object at 0x12a8e5460>, <matplotlib.axis.XTick object at 0x12a8e5340>, <matplotlib.axis.XTick object at 0x128e68c70>, <matplotlib.axis.XTick object at 0x12a8e5a00>, <matplotlib.axis.XTick object at 0x12a907250>, <matplotlib.axis.XTick object at 0x127c257f0>, <matplotlib.axis.XTick object at 0x127c25670>, <matplotlib.axis.XTick object at 0x128e77940>, <matplotlib.axis.XTick object at 0x126286a00>, <matplotlib.axis.XTick object at 0x126286d30>, <matplotlib.axis.XTick object at 0x1262b11c0>, <matplotlib.axis.XTick object at 0x1262b1f70>, <matplotlib.axis.XTick object at 0x128e7a280>, <matplotlib.axis.XTick object at 0x1262b1a00>, <matplotlib.axis.XTick object at 0x128e77190>], <a list of 21 Text xticklabel objects>)
plt.semilogy()
## []
plt.ylabel('Number of reads')
plt.tight_layout()
plt.show()

### Genera stacked bars (absolute abundance)

for db in range(len(conf_bracken_overall)):
  rename_samples = {}
  for sample in list(conf_bracken_overall[db].columns):
    rename_samples[sample] = sample+'_'+names[db]
  this_db = pd.DataFrame(conf_bracken_overall[db].rename(columns=rename_samples))
  if db == 0:
    overall_genus = this_db
  else:
    overall_genus = pd.concat([overall_genus, this_db], axis=1)
overall_genus = pd.DataFrame(overall_genus)
#overall_genus = pd.DataFrame(overall_genus.divide(overall_genus.sum(axis=0)).multiply(100))
overall_genus = overall_genus[overall_genus.max(axis=1) > 25000]
sums = overall_genus.sum(axis=0)
gen_colors = get_cols(overall_genus.shape[0])
print(overall_genus)
##                SRR8595488_gtdb  ...  SRR8595509_refseq
## name                            ...                   
## Achromobacter           6731.0  ...               11.0
## Acidisphaera             602.0  ...                3.0
## Acinetobacter           5068.0  ...             4597.0
## Actinomyces              340.0  ...             4126.0
## Aeromonas                802.0  ...                0.0
## ...                        ...  ...                ...
## Veillonella               18.0  ...            20951.0
## Vibrio                   232.0  ...             1978.0
## Weissella                 24.0  ...                3.0
## Xanthomonas               14.0  ...              227.0
## Xylanimonas                0.0  ...                2.0
## 
## [112 rows x 84 columns]
fig = plt.figure(figsize=(14,10))
ax1 = plt.subplot(223)
ax = [ax1, plt.subplot(221, sharey=ax1), plt.subplot(222, sharey=ax1), plt.subplot(224, sharey=ax1)]
x1 = [x for x in range(21)]
a = 0
col_samples = list(overall_genus.columns)
#line 20
new_db = []
for name in names:
  keeping = []
  for s in list(overall_genus.columns):
    if name in s:
      keeping.append(True)
    else:
      keeping.append(False)
  other_new_db = pd.DataFrame(overall_genus.loc[:, keeping])
  new_db.append(other_new_db)
for db in range(len(new_db)):
  this_genera = list(new_db[db].index.values)
  handles = []
  for g in range(len(this_genera)):
    if g == 0:
      bottom = [0 for c in x1]
    these_values = new_db[db].loc[this_genera[g], :].values
    ax[db].bar(x1, these_values, bottom=bottom, color=gen_colors[g], edgecolor='k', width=1)
    handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=this_genera[g]))
    bottom = [bottom[x]+these_values[x] for x in range(len(bottom))]
  plt.sca(ax[db])
  if db == 1 or db == 2:
    plt.xticks([])
  else:
    plt.xticks(x1, sample_names, rotation=90)
  plt.semilogy()
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([<matplotlib.axis.XTick object at 0x126845d30>, <matplotlib.axis.XTick object at 0x126845d00>, <matplotlib.axis.XTick object at 0x1268455b0>, <matplotlib.axis.XTick object at 0x12a67c910>, <matplotlib.axis.XTick object at 0x12a67ceb0>, <matplotlib.axis.XTick object at 0x12a686490>, <matplotlib.axis.XTick object at 0x12a686a30>, <matplotlib.axis.XTick object at 0x12a686fd0>, <matplotlib.axis.XTick object at 0x12a68b5b0>, <matplotlib.axis.XTick object at 0x12a68bb50>, <matplotlib.axis.XTick object at 0x12a690130>, <matplotlib.axis.XTick object at 0x12a68b6d0>, <matplotlib.axis.XTick object at 0x12a686160>, <matplotlib.axis.XTick object at 0x12a690490>, <matplotlib.axis.XTick object at 0x12a690a30>, <matplotlib.axis.XTick object at 0x12a690fd0>, <matplotlib.axis.XTick object at 0x12a6955b0>, <matplotlib.axis.XTick object at 0x12a695b50>, <matplotlib.axis.XTick object at 0x12a69a130>, <matplotlib.axis.XTick object at 0x12a69a6d0>, <matplotlib.axis.XTick object at 0x12a695790>], <a list of 21 Text xticklabel objects>)
## []
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([], <a list of 0 Text xticklabel objects>)
## []
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([], <a list of 0 Text xticklabel objects>)
## []
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([<matplotlib.axis.XTick object at 0x1294fa400>, <matplotlib.axis.XTick object at 0x1294fa430>, <matplotlib.axis.XTick object at 0x129504f40>, <matplotlib.axis.XTick object at 0x130bc47f0>, <matplotlib.axis.XTick object at 0x130bc4d90>, <matplotlib.axis.XTick object at 0x130bcc370>, <matplotlib.axis.XTick object at 0x130bcc910>, <matplotlib.axis.XTick object at 0x130bcceb0>, <matplotlib.axis.XTick object at 0x130bd1490>, <matplotlib.axis.XTick object at 0x130bd1a30>, <matplotlib.axis.XTick object at 0x130bd1fd0>, <matplotlib.axis.XTick object at 0x130bd1070>, <matplotlib.axis.XTick object at 0x130bc4f70>, <matplotlib.axis.XTick object at 0x130bd4370>, <matplotlib.axis.XTick object at 0x130bd4910>, <matplotlib.axis.XTick object at 0x130bd4eb0>, <matplotlib.axis.XTick object at 0x130bdb490>, <matplotlib.axis.XTick object at 0x130bdba30>, <matplotlib.axis.XTick object at 0x130bdbfd0>, <matplotlib.axis.XTick object at 0x130be15b0>, <matplotlib.axis.XTick object at 0x130bdb670>], <a list of 21 Text xticklabel objects>)
## []
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1), ncol=3)
plt.tight_layout()
plt.show()
## /Library/Frameworks/R.framework/Versions/3.6/Resources/library/reticulate/python/rpytools/call.py:13: UserWarning: Attempted to set non-positive bottom ylim on a log-scaled axis.
## Invalid limit will be ignored.
##   res = rpycall.call_r_function(f, *args, **kwargs)
## /Library/Frameworks/R.framework/Versions/3.6/Resources/library/reticulate/python/rpytools/call.py:13: UserWarning: Attempting to set identical bottom == top == 1.0 results in singular transformations; automatically expanding.
##   res = rpycall.call_r_function(f, *args, **kwargs)

### Genera stacked bars (relative abundance)

for db in range(len(conf_bracken_overall)):
  rename_samples = {}
  for sample in list(conf_bracken_overall[db].columns):
    rename_samples[sample] = sample+'_'+names[db]
  this_db = pd.DataFrame(conf_bracken_overall[db].rename(columns=rename_samples))
  if db == 0:
    overall_genus = this_db
  else:
    overall_genus = pd.concat([overall_genus, this_db], axis=1)
overall_genus = pd.DataFrame(overall_genus)
overall_genus = pd.DataFrame(overall_genus.divide(overall_genus.sum(axis=0)).multiply(100))
overall_genus = overall_genus[overall_genus.max(axis=1) > 1]
sums = overall_genus.sum(axis=0)
gen_colors = get_cols(overall_genus.shape[0])
print(overall_genus)
##                SRR8595488_gtdb  ...  SRR8595509_refseq
## name                            ...                   
## Achromobacter         3.474961  ...           0.004477
## Acinetobacter         2.616417  ...           1.870874
## Actinomyces           0.175529  ...           1.679188
## Alcanivorax           2.164171  ...           0.001628
## Amycolatopsis         0.926174  ...           0.015465
## ...                        ...  ...                ...
## Thermosipho           0.005679  ...           0.000407
## Treponema             0.001549  ...           0.015872
## Veillonella           0.009293  ...           8.526580
## Weissella             0.012390  ...           0.001221
## Xanthomonas           0.007228  ...           0.092384
## 
## [64 rows x 84 columns]
fig = plt.figure(figsize=(14,10))
ax1 = plt.subplot(223)
ax = [ax1, plt.subplot(221, sharey=ax1), plt.subplot(222, sharey=ax1), plt.subplot(224, sharey=ax1)]
x1 = [x for x in range(21)]
a = 0
col_samples = list(overall_genus.columns)
#line 20
new_db = []
for name in names:
  keeping = []
  for s in list(overall_genus.columns):
    if name in s:
      keeping.append(True)
    else:
      keeping.append(False)
  other_new_db = pd.DataFrame(overall_genus.loc[:, keeping])
  new_db.append(other_new_db)
for db in range(len(new_db)):
  this_genera = list(new_db[db].index.values)
  handles = []
  for g in range(len(this_genera)):
    if g == 0:
      bottom = [0 for c in x1]
    these_values = new_db[db].loc[this_genera[g], :].values
    ax[db].bar(x1, these_values, bottom=bottom, color=gen_colors[g], edgecolor='k', width=1)
    handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=this_genera[g]))
    bottom = [bottom[x]+these_values[x] for x in range(len(bottom))]
  plt.sca(ax[db])
  if db == 1 or db == 2:
    plt.xticks([])
  else:
    plt.xticks(x1, sample_names, rotation=90)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([<matplotlib.axis.XTick object at 0x1294d8370>, <matplotlib.axis.XTick object at 0x1294d8340>, <matplotlib.axis.XTick object at 0x132921910>, <matplotlib.axis.XTick object at 0x12a1c2850>, <matplotlib.axis.XTick object at 0x12a1c20a0>, <matplotlib.axis.XTick object at 0x127c4ca60>, <matplotlib.axis.XTick object at 0x127c4c910>, <matplotlib.axis.XTick object at 0x127c4c130>, <matplotlib.axis.XTick object at 0x125d30ca0>, <matplotlib.axis.XTick object at 0x125d30c70>, <matplotlib.axis.XTick object at 0x125df3e50>, <matplotlib.axis.XTick object at 0x125df3280>, <matplotlib.axis.XTick object at 0x125df3400>, <matplotlib.axis.XTick object at 0x128654c70>, <matplotlib.axis.XTick object at 0x128654a60>, <matplotlib.axis.XTick object at 0x125df3a90>, <matplotlib.axis.XTick object at 0x125d301c0>, <matplotlib.axis.XTick object at 0x128654e20>, <matplotlib.axis.XTick object at 0x13129f1c0>, <matplotlib.axis.XTick object at 0x13129f6a0>, <matplotlib.axis.XTick object at 0x12a695490>], <a list of 21 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([], <a list of 0 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([], <a list of 0 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([<matplotlib.axis.XTick object at 0x1269ae520>, <matplotlib.axis.XTick object at 0x1269aea00>, <matplotlib.axis.XTick object at 0x1269ae580>, <matplotlib.axis.XTick object at 0x12e176c70>, <matplotlib.axis.XTick object at 0x12e166250>, <matplotlib.axis.XTick object at 0x12e1667f0>, <matplotlib.axis.XTick object at 0x12e166d90>, <matplotlib.axis.XTick object at 0x12e16b370>, <matplotlib.axis.XTick object at 0x12e16b910>, <matplotlib.axis.XTick object at 0x12e16beb0>, <matplotlib.axis.XTick object at 0x12e163490>, <matplotlib.axis.XTick object at 0x12e163a30>, <matplotlib.axis.XTick object at 0x12e16b580>, <matplotlib.axis.XTick object at 0x12e1663d0>, <matplotlib.axis.XTick object at 0x12e1635b0>, <matplotlib.axis.XTick object at 0x12e154370>, <matplotlib.axis.XTick object at 0x12e154910>, <matplotlib.axis.XTick object at 0x12e154eb0>, <matplotlib.axis.XTick object at 0x12e156490>, <matplotlib.axis.XTick object at 0x12e156a30>, <matplotlib.axis.XTick object at 0x12e156fd0>], <a list of 21 Text xticklabel objects>)
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1), ncol=3)
plt.tight_layout()
plt.show()

Functional profiling

Function is profiled using:
1. HUMAnN2
- Uniref90 database
- Uniref50 database
2. HUMAnN3
- Uniref90 and Uniref50
- This is still running
2. MEGAHIT and the Anvi’o TARA oceans workflow
- This is still running

HUMAnN2 unaligned reads

plt.figure(figsize=(8,6))
ax1 = plt.subplot(111)
#unaligned_after_translation_uniref_90
#unaligned_after_translation_uniref_50
x1 = [x for x in range(21)]
x2 = [x+0.4 for x in range(21)]
x = [x+0.2 for x in range(21)]

ax1.bar(x1, reads.loc[:, 'unaligned_after_translation_uniref_90'].values, color=colors, edgecolor='k', width=0.4)
ax1.bar(x2, reads.loc[:, 'unaligned_after_translation_uniref_50'].values, color=colors, edgecolor='k', width=0.4, alpha=0.5)
plt.xticks(x, sample_names, rotation=90)
plt.ylabel('Unaligned reads after translation (%)')
plt.xlim([-0.5,21])
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
ax1.legend(handles=handles, bbox_to_anchor=(1,1))

plt.tight_layout()
plt.show()

## HUMAnN2 richness {.tabset}

After running HUMAnN2, the pathabundance, pathcoverage and genefamilies tables are joined, and the pathabundance is renormalised (relative abundance calculated) and split to tables that are stratified by species or not. Here we use the unstratified pathabundance file (the MetaPhlAn2 species results don’t seem to have been particularly accurate). The genefamilies file is stratified by default so we take only the overall values for the gene family (removing all species results) and re-calculate these values as relative abundances.

Gene family richness

def get_richness(df):
  snames = list(df.columns)
  num_genes = []
  for sn in snames:
    one_sample = pd.DataFrame(df.loc[:, sn])
    one_sample = one_sample[one_sample.max(axis=1) > 0]
    num_genes.append(one_sample.shape[0])
  return snames, num_genes

genes_90 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/humann2_genefamilies.tsv', header=0, index_col=0, sep='\t')
genes_90.drop('UNMAPPED', axis=0, inplace=True)
gene_names = list(genes_90.index.values)
keeping = []
for gn in gene_names:
    new_gn = gn.split('|')[0]
    if gn == new_gn: keeping.append(True)
    else: keeping.append(False)
genes_90_abs = genes_90.loc[keeping, :]
genes_90 = genes_90_abs.divide(genes_90_abs.sum(axis=0)).multiply(100)
pathways_90 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/humann2_pathabundance_relab_unstratified.tsv', header=0, index_col=0, sep='\t')

snames_genes_90, genes_90_richness = get_richness(genes_90)
snames_pways_90, pways_90_richness = get_richness(pathways_90)

genes_50 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_50/humann2_genefamilies.tsv', header=0, index_col=0, sep='\t')
genes_50.drop('UNMAPPED', axis=0, inplace=True)
gene_names = list(genes_50.index.values)
keeping = []
for gn in gene_names:
    new_gn = gn.split('|')[0]
    if gn == new_gn: keeping.append(True)
    else: keeping.append(False)
genes_50_abs = genes_50.loc[keeping, :]
genes_50 = genes_50_abs.divide(genes_50_abs.sum(axis=0)).multiply(100)
pathways_50 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_50/humann2_pathabundance_relab_unstratified.tsv', header=0, index_col=0, sep='\t')

snames_genes_50, genes_50_richness = get_richness(genes_50)
snames_pways_50, pways_50_richness = get_richness(pathways_50)
plt.figure(figsize=(6,6))
ax1 = plt.subplot(211)
ax2 = plt.subplot(212)

x = [a for a in range(len(genes_90.columns))]
x1 = [a+0.4 for a in x]

ax1.bar(x, genes_90_richness, width=0.4, color=colors, edgecolor='k')
ax2.bar(x, genes_90_richness, width=0.4, color=colors, edgecolor='k')
ax1.bar(x1[:len(genes_50_richness)], genes_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
ax2.bar(x1[:len(genes_50_richness)], genes_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
plt.sca(ax1)
plt.xlim([-0.5,21]), plt.xticks([])
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
plt.legend(handles=handles, bbox_to_anchor=(1,1))
plt.ylabel('Gene family\nrichness')

plt.sca(ax2)
plt.semilogy()
plt.xlim([-0.5,21])
plt.ylabel('Gene family\nrichness (log scale)')
plt.xticks(x, sample_names, rotation=90)
plt.tight_layout()
plt.show()

Pathway richness

#54
plt.figure(figsize=(6,6))
ax1 = plt.subplot(211)
ax2 = plt.subplot(212)

x = [a for a in range(len(genes_90.columns))]
x1 = [a+0.4 for a in x]

ax1.bar(x, pways_90_richness, width=0.4, color=colors, edgecolor='k')
ax1.bar(x1, pways_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
ax2.bar(x, pways_90_richness, width=0.4, color=colors, edgecolor='k')
ax2.bar(x1, pways_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
plt.sca(ax1)
plt.xticks([])
plt.ylabel('Pathway richness')
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
plt.legend(handles=handles, bbox_to_anchor=(1,1))
plt.xlim([-0.5,21])
plt.sca(ax2)
plt.semilogy()
plt.xlim([-0.5,21]), plt.xticks(x, sample_names, rotation=90)
plt.ylabel('Pathway richness (log scale)')
plt.tight_layout()
plt.show()

HUMAnN2 differential abundance between body sites

Now we are looking at the gene families and pathways that are differentially abundant between body sites, using the lists that are output by HUMAnN2.

Number of differentially abundant pathways between body sites (Uniref90)

Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.

def get_diff_abundant(genes, name_csv):
    col_names = list(genes.columns)
    col_names_dict = {}
    for cn in col_names:
        if 'SRR8595488' in cn:
          genes.drop([cn], axis=1, inplace=True)
        cn_new = cn.split('_')[0]
        col_names_dict[cn] = site_dict[cn_new]
    new_genes = genes.rename(columns=col_names_dict)
    comparisons = [['Saliva', 'Blood'], ['Buccal', 'Blood'], ['Saliva', 'Buccal'], ['Saliva', 'Saliva_euk'], ['Buccal', 'Buccal_euk'], ['Blood', 'Saliva_euk'], ['Blood', 'Buccal_euk']]
    fig = plt.figure(figsize=(15,8))
    ax = [plt.subplot2grid((2,8), (0,1), colspan=2), plt.subplot2grid((2,8), (0,3), colspan=2), plt.subplot2grid((2,8), (0,5), colspan=2), plt.subplot2grid((2,8), (1,0), colspan=2), plt.subplot2grid((2,8), (1,2), colspan=2), plt.subplot2grid((2,8), (1,4), colspan=2), plt.subplot2grid((2,8), (1,6), colspan=2)]
    for a in range(len(comparisons)):
        ax[a].set_title(comparisons[a][0]+r' $vs$ '+comparisons[a][1])
        this_comparison = new_genes.loc[:, comparisons[a]]
        this_comparison = this_comparison[this_comparison.max(axis=1) > 0]
        genes_comp = list(this_comparison.index.values)
        fc, pval, colors_p, this_fc = [], [], [], []
        for b in range(len(genes_comp)):
            c1, c2 = list(this_comparison.loc[genes_comp[b], comparisons[a][0]]), list(this_comparison.loc[genes_comp[b], comparisons[a][1]])
            
            c1, c2 = [0.000001 if x == 0 else x for x in c1], [0.000001 if x == 0 else x for x in c2]
            c1, c2 = [math.log2(c1[c]) for c in range(len(c1))], [math.log2(c2[c]) for c in range(len(c2))]
            t, p = stats.ttest_ind(c1, c2)
            c1, c2 = np.median(c1), np.median(c2)
            diff = c1/c2
            if diff < 1 and diff > 0: diff = -(1/diff)
            fc.append(diff), pval.append(p)
            this_fc.append([genes_comp[b], diff, p])
        colors_p_all = [colors_dict[comparisons[a][0]], colors_dict[comparisons[a][1]]]
        for c in range(len(fc)):
            if fc[c] >= 2 and pval[c] <= 0.05: colors_p.append(colors_p_all[1])
            elif fc[c] <= -2 and pval[c] <= 0.05: colors_p.append(colors_p_all[0])
            else: colors_p.append('#B1B1B1')
            pval[c] = -math.log10(pval[c])
        com = comparisons[a][0]+'_'+comparisons[a][1]
        if a == 0:
          df_pval = pd.DataFrame(this_fc, columns=['Genes/pathways', com+'_FC', com+'_p'])
          df_pval.set_index('Genes/pathways', inplace=True)
        else:
          new_df = pd.DataFrame(this_fc, columns=['Genes/pathways', com+'_FC', com+'_p'])
          new_df.set_index('Genes/pathways', inplace=True)
          df_pval = pd.concat([df_pval, new_df], axis=1, join='outer')
        ma = max([max(fc), abs(min(fc))])+0.5
        if ma > 10: ma = 10.5
        ax[a].set_xlim([-ma, ma])
        ax[a].scatter(fc, pval, marker='o', c=colors_p, s=10)
        ax[a].set_xlabel(r'Log$_2$ fold change')
        if a == 0 or a == 3:
          ax[a].set_ylabel('-log(p value)')
        plt.sca(ax[a])
    df_pval.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/'+name_csv+'.csv')
    handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
    ax[2].legend(handles=handles, bbox_to_anchor=(1.04,1), loc='upper left')
    return
pways = pd.DataFrame(pathways_90)
get_diff_abundant(pways, 'pathways_90')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

Number of differentially abundant pathways between body sites (Uniref50)

Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.

pways = pd.DataFrame(pathways_50)
get_diff_abundant(pways, 'pathways_50')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

Number of differentially abundant gene families between body sites (Uniref90)

Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.

pways = pd.DataFrame(genes_90)
get_diff_abundant(pways, 'genes_90')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

Number of differentially abundant gene families between body sites (Uniref50)

Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.

pways = pd.DataFrame(genes_50)
get_diff_abundant(pways, 'genes_50')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

HUMAnN3 unaligned reads

HUMAnN3 richness

Gene family richness

Pathway richness

HUMAnN3 differential abundance between body sites

Number of differentially abundant pathways between body sites

Number of differentially abundant gene families between body sites

Anvi’o number of…

Contigs

MAGs

Anvi’o richness

Gene family richness

Pathway richness

Anvi’o differential abundance between body sites

Number of differentially abundant pathways between body sites

Number of differentially abundant gene families between body sites